from unsloth import GRPOTrainer
from datasets import load_dataset
# GRPO training script: load the prompt dataset, build the trainer on top of an
# existing checkpoint, run training, then write the resulting model to disk.

# Filesystem locations used by this run.
RL_DATA_FILE = "data/rl.jsonl"
BASE_CHECKPOINT = "ckpt/r1-sft1"  # presumably the SFT-stage output, judging by the path
RL_OUTPUT_DIR = "ckpt/r1-rl1"  # used both as trainer output_dir and final save target

# The "train" split of a local JSON-lines file, read with the generic "json" loader.
ds = load_dataset("json", data_files=RL_DATA_FILE, split="train")

# Tuning knobs forwarded verbatim to the trainer constructor as keyword arguments.
# NOTE(review): upstream TRL takes these through a GRPOConfig passed as `args=`
# rather than as direct constructor keywords -- confirm that the GRPOTrainer
# exported by `unsloth` accepts them in this form.
grpo_settings = dict(
    max_prompt_length=512,
    max_completion_length=1536,
    # 1 sample per device x 16 accumulation steps between optimizer updates.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    learning_rate=1e-5,
    beta=0.04,  # presumably the KL-penalty weight -- TODO confirm against trainer docs
    output_dir=RL_OUTPUT_DIR,
)

# NOTE(review): `reward_fn` is not defined in the visible part of this file; it
# must be defined or imported before this point or the line below raises NameError.
trainer = GRPOTrainer(
    model=BASE_CHECKPOINT,
    reward_funcs=[reward_fn],
    train_dataset=ds,
    **grpo_settings,
)

trainer.train()
trainer.save_model(RL_OUTPUT_DIR)
